In [89]:
import pandas as pd
import matplotlib.pyplot as plt
# Load datasets
df_parking = pd.read_csv('data/parking/parkingNYC_Jan2025.csv')
df_codes = pd.read_csv('data/parking/Parking_ViolationCodes_January2020.csv')
/var/folders/46/fdrjqjdn3jq9yg03908v7r_80000gn/T/ipykernel_39145/786070329.py:5: DtypeWarning: Columns (36) have mixed types. Specify dtype option on import or set low_memory=False.
df_parking = pd.read_csv('data/parking/parkingNYC_Jan2025.csv')
In [90]:
df_codes.columns
Out[90]:
Index(['VIOLATION CODE', 'VIOLATION DESCRIPTION',
'Manhattan 96th St. & below\n(Fine Amount $)',
'All Other Areas\n(Fine Amount $)'],
dtype='object')
In [91]:
df_parking['Violation Code'] = df_parking['Violation Code'].astype(int)
df_codes['Violation Code'] = df_codes['VIOLATION CODE'].astype(int)
df = df_parking.merge(
df_codes[['Violation Code', 'VIOLATION DESCRIPTION', 'Manhattan 96th St. & below\n(Fine Amount $)']],
on='Violation Code',
how='left'
)
In [92]:
df['Fine'] = pd.to_numeric(df['Manhattan 96th St. & below\n(Fine Amount $)'], errors='coerce')
In [93]:
violation_counts = df['Violation Description'].value_counts().head(10)
plt.figure(figsize=(10,6))
violation_counts.plot(kind='bar')
plt.ylabel('Number of Violations')
plt.title('Top 10 Most Common Violation Types')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# --- Top 10 Violation Types by Revenue ---
# Calculate total revenue per violation type
revenue = df.groupby('Violation Description')['Fine'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,6))
revenue.plot(kind='bar')
plt.ylabel('Total Revenue')
plt.title('Top 10 Violation Types by Total Revenue')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# For reference, you might also print out the rankings:
print("Top 10 Violations by Count:")
print(violation_counts)
print("\nTop 10 Violations by Revenue:")
print(revenue)
Top 10 Violations by Count: Violation Description 38-Failure to Dsplay Meter Rec 25583 14-No Standing 23598 21-No Parking (street clean) 20547 31-No Stand (Com. Mtr. Zone) 16084 69-Fail to Dsp Prking Mtr Rcpt 14830 20A-No Parking (Non-COM) 10020 20-No Parking (Com Plate) 9010 40-Fire Hydrant 8926 46B-Double Parking (Com-100Ft) 7309 71A-Insp Sticker Expired (NYS) 6303 Name: count, dtype: int64 Top 10 Violations by Revenue: Violation Description 14-No Standing 2713770.0 31-No Stand (Com. Mtr. Zone) 1849660.0 38-Failure to Dsplay Meter Rec 1662895.0 21-No Parking (street clean) 1335555.0 40-Fire Hydrant 1026490.0 69-Fail to Dsp Prking Mtr Rcpt 963950.0 46B-Double Parking (Com-100Ft) 840535.0 20A-No Parking (Non-COM) 651300.0 20-No Parking (Com Plate) 585650.0 19-No Stand (bus stop) 571435.0 Name: Fine, dtype: float64
In [94]:
df.columns
Out[94]:
Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
'Violation Time', 'Time First Observed', 'Violation County',
'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
'Intersecting Street', 'Date First Observed', 'Law Section',
'Sub Division', 'Violation Legal Code', 'Days Parking In Effect',
'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
'Feet From Curb', 'Violation Post Code', 'Violation Description',
'No Standing or Stopping Violation', 'Hydrant Violation',
'Double Parking Violation', 'issue_date', 'year', 'month', 'day',
'VIOLATION DESCRIPTION', 'Manhattan 96th St. & below\n(Fine Amount $)',
'Fine'],
dtype='object')
In [95]:
df_non_commercial = df[df['Plate Type'] == 'PAS']
import seaborn as sns
df_filtered = df_non_commercial[
(df_non_commercial['Vehicle Year'] >= 1990) &
(df_non_commercial['Vehicle Year'] <= 2025)
]
# Example: limit to a handful of common colors
common_colors = ['BLACK', 'WHITE', 'GRAY', 'SILVER', 'RED',
'BLUE', 'GREEN', 'BROWN', 'BEIGE', 'GOLD']
df_filtered = df_filtered[df_filtered['Vehicle Color'].isin(common_colors)]
# 2. Compute average fine by (Color, Year)
avg_fine = df_filtered.groupby(['Vehicle Color', 'Vehicle Year'])['Fine'].mean().reset_index()
# 3. Pivot so that rows = Vehicle Color, columns = Vehicle Year, values = Average Fine
heat_data = avg_fine.pivot(index='Vehicle Color', columns='Vehicle Year', values='Fine')
# 4. Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heat_data, cmap='YlOrRd', annot=True, fmt=".1f")
plt.title('Average Fine by Vehicle Year and Vehicle Color (Non-Commercial Vehicles)')
plt.xlabel('Vehicle Year')
plt.ylabel('Vehicle Color')
plt.tight_layout()
plt.show()
Findings: 1 The average fines vary across different vehicle years and colors 2 Some color–year pairs (for example early 2020s) show higher average fines. 3 Some of the lighter cells may reflect very few tickets issued (no data or less reliable averages).
In [96]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import shapely.geometry
gdf_precincts = gpd.read_file('data/police_precincts/geojson/NYC_police_precincts_Mar2025.json')
manhattan_box = shapely.geometry.box(-74.02, 40.70, -73.93, 40.88)
gdf_precincts.head
Out[96]:
<bound method NDFrame.head of OBJECTID Precinct Shape__Area Shape__Length \
0 1 1 4.728475e+07 80093.490309
1 2 5 1.809477e+07 18807.080607
2 3 6 2.214302e+07 27255.991120
3 4 7 1.836663e+07 17287.655061
4 5 9 2.139539e+07 19772.510741
.. ... ... ... ...
72 73 115 1.141415e+08 60082.476284
73 74 116 1.644177e+08 73462.652558
74 75 120 2.323334e+08 92949.901858
75 76 121 4.755776e+08 136811.464636
76 77 122 4.548466e+08 154846.589526
geometry
0 MULTIPOLYGON (((-74.04388 40.6902, -74.04351 4...
1 POLYGON ((-73.98864 40.72294, -73.98869 40.722...
2 POLYGON ((-74.00915 40.74252, -74.00895 40.742...
3 POLYGON ((-73.97346 40.71897, -73.97357 40.718...
4 POLYGON ((-73.97161 40.72673, -73.97163 40.726...
.. ...
72 POLYGON ((-73.85892 40.76241, -73.85931 40.762...
73 POLYGON ((-73.72586 40.68325, -73.72591 40.682...
74 POLYGON ((-74.05357 40.6037, -74.05407 40.6035...
75 MULTIPOLYGON (((-74.15946 40.64146, -74.15975 ...
76 MULTIPOLYGON (((-74.05051 40.56643, -74.05047 ...
[77 rows x 5 columns]>
In [97]:
gdf_manhattan = gdf_precincts[gdf_precincts.centroid.within(manhattan_box)]
df['Precinct'] = df['Violation Precinct']
# --- Part (a): Aggregate Data by Precinct ---
# Compute total tickets and total fines per precinct:
precinct_stats = df.groupby('Precinct').agg(
ticket_count=('Precinct', 'count'),
total_fine=('Fine', 'sum')
).reset_index()
precinct_stats['avg_fine'] = precinct_stats['total_fine'] / precinct_stats['ticket_count']
# Ensure that the "Precinct" column types match between your data and the precinct boundaries.
gdf_manhattan['Precinct'] = gdf_manhattan['Precinct'].astype(int)
precinct_stats['Precinct'] = precinct_stats['Precinct'].astype(int)
# Merge the aggregated stats with Manhattan precinct geometries
gdf_stats = gdf_manhattan.merge(precinct_stats, on='Precinct', how='left')
# --- 3. Produce Choropleth Maps for Part (a) ---
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
# Map 1: Total Number of Tickets
gdf_stats.plot(column='ticket_count', cmap='Blues', linewidth=0.8, ax=axes[0], edgecolor='0.8', legend=True)
axes[0].set_title('Total Number of Tickets')
# Map 2: Total Amount of Fines
gdf_stats.plot(column='total_fine', cmap='Greens', linewidth=0.8, ax=axes[1], edgecolor='0.8', legend=True)
axes[1].set_title('Total Amount of Fines')
# Map 3: Average Fine Amount
gdf_stats.plot(column='avg_fine', cmap='Reds', linewidth=0.8, ax=axes[2], edgecolor='0.8', legend=True)
axes[2].set_title('Average Fine Amount')
plt.tight_layout()
plt.show()
# --- Part (b): Faceted Maps by Violation Group ---
# Aggregate violation counts per precinct and violation_group:
group_stats = df.groupby(['Precinct', 'Violation Description']).size().reset_index(name='ticket_count')
# Merge with Manhattan precinct boundaries
gdf_violation = gdf_manhattan.merge(group_stats, on='Precinct', how='left')
# Determine unique violation groups
violation_groups = gdf_violation['Violation Description'].dropna().unique()
n_groups = len(violation_groups)
ncols = 2 # Adjust the number of columns in the facet grid as needed
nrows = (n_groups + ncols - 1) // ncols
fig, axes = plt.subplots(nrows, ncols, figsize=(12, nrows * 5))
axes = axes.flatten()
for i, group in enumerate(violation_groups):
subset = gdf_violation[gdf_violation['Violation Description'] == group]
subset.plot(column='ticket_count', cmap='OrRd', linewidth=0.8, ax=axes[i], edgecolor='0.8', legend=True)
axes[i].set_title(f'Violation Group: {group}')
# Remove any extra subplots if there are leftover axes
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
/var/folders/46/fdrjqjdn3jq9yg03908v7r_80000gn/T/ipykernel_39145/1036204900.py:1: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation. gdf_manhattan = gdf_precincts[gdf_precincts.centroid.within(manhattan_box)] /opt/anaconda3/lib/python3.12/site-packages/geopandas/geodataframe.py:1819: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy super().__setitem__(key, value)
The total number of tickets and total amount of fines track similarly, but precincts with relatively fewer tickets can still show high total fines if those violations are expensive.
The average fine can be skewed by a small number of high-penalty tickets, so it doesn’t always reflect overall enforcement activity.
The total amount of fines is often the most representative because it balances how many tickets were issued with how severe the penalties were.
In [100]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
df_hydrant = df[(df['Violation Code'] == 40) & (df['Violation Precinct'] == 19)].copy()
df_hydrant['full_address'] = (df_hydrant['House Number'].astype(str) + ' ' +
df_hydrant['Street Name'] + ', Upper East Side, Manhattan, NY')
df_hydrant = df_hydrant.head(500).reset_index(drop=True)
geolocator = Nominatim(user_agent="parking_app", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=5, error_wait_seconds=2)
def get_lat_lon(address):
try:
location = geocode(address)
if location:
return pd.Series({'latitude': location.latitude, 'longitude': location.longitude})
except Exception as e:
print(e)
return pd.Series({'latitude': None, 'longitude': None})
geocoded = df_hydrant['full_address'].apply(get_lat_lon)
df_geocoded = pd.concat([df_hydrant, geocoded], axis=1)
print(df_geocoded[['Summons Number', 'House Number', 'Street Name', 'full_address', 'latitude', 'longitude']].head())
Summons Number House Number Street Name \
0 9175839120 N E 83rd St
1 9175839179 527 E 78th St
2 9175839192 513 E 75th St
3 9176447868 N E 74th St
4 9176447972 303 E 93rd St
full_address latitude longitude
0 N E 83rd St, Upper East Side, Manhattan, NY 40.776282 -73.954015
1 527 E 78th St, Upper East Side, Manhattan, NY 40.770100 -73.949246
2 513 E 75th St, Upper East Side, Manhattan, NY 40.768545 -73.951226
3 N E 74th St, Upper East Side, Manhattan, NY 40.770973 -73.959431
4 303 E 93rd St, Upper East Side, Manhattan, NY 40.782254 -73.947980
In [101]:
import folium
map_center = [40.7736, -73.9566]
m = folium.Map(location=map_center, zoom_start=14)
for _, row in df_geocoded.iterrows():
popup_info = (f"Summons: {row['Summons Number']}<br>"
f"Vehicle Make: {row['Vehicle Make']}<br>"
f"Violation: {row['Violation Description']}")
if pd.notnull(row['latitude']) and pd.notnull(row['longitude']):
folium.Marker(
location=[row['latitude'], row['longitude']],
popup=popup_info,
).add_to(m)
# Display the interactive map
m.save("hydrant_violations_map.html")
In [102]:
# Define car brands
luxury_brands = ['BMW', 'MERCEDES', 'AUDI', 'LEXUS', 'PORSCHE', 'JAGUAR', 'LAND ROVER', 'INFINITI', 'ACURA']
df_geocoded['luxury_car'] = df_geocoded['Vehicle Make'].str.upper().isin(luxury_brands)
# updated map
m2 = folium.Map(location=map_center, zoom_start=14)
# Define marker colors
def marker_color(is_luxury):
return 'red' if is_luxury else 'blue'
for _, row in df_geocoded.iterrows():
popup_info = (f"Summons: {row['Summons Number']}<br>"
f"Vehicle Make: {row['Vehicle Make']}<br>"
f"Violation: {row['Violation Description']}<br>"
f"Luxury Car: {'Yes' if row['luxury_car'] else 'No'}")
if pd.notnull(row['latitude']) and pd.notnull(row['longitude']):
folium.Marker(
location=[row['latitude'], row['longitude']],
popup=popup_info,
icon=folium.Icon(color=marker_color(row['luxury_car']))
).add_to(m2)
legend_html = """
<div style="position: fixed;
bottom: 50px; left: 50px; width: 150px; height: 90px;
border:2px solid grey; z-index:9999; font-size:14px;
background-color:white;
">
<b>Car Type</b><br>
<i class="fa fa-map-marker fa-2x" style="color:red"></i> Luxury<br>
<i class="fa fa-map-marker fa-2x" style="color:blue"></i> Non-Luxury
</div>
"""
m2.get_root().html.add_child(folium.Element(legend_html))
m2.save("hydrant_violations_luxury_map.html")
In [103]:
from folium.plugins import MarkerCluster
m3 = folium.Map(location=map_center, zoom_start=14)
marker_cluster = MarkerCluster().add_to(m3)
for _, row in df_geocoded.iterrows():
popup_info = (f"Summons: {row['Summons Number']}<br>"
f"Vehicle Make: {row['Vehicle Make']}<br>"
f"Violation: {row['Violation Description']}<br>"
f"Luxury Car: {'Yes' if row['luxury_car'] else 'No'}")
if pd.notnull(row['latitude']) and pd.notnull(row['longitude']):
folium.Marker(
location=[row['latitude'], row['longitude']],
popup=popup_info,
icon=folium.Icon(color=marker_color(row['luxury_car']))
).add_to(marker_cluster)
m3.get_root().html.add_child(folium.Element(legend_html))
m3.save("hydrant_violations_clustered_map.html")
In [ ]: